Still under construction.
(III) Detailed List
- Read and load the source code of each of the 250 movie pages, line by line.
- Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
- The data was collected on 2020-10-29.
# Fetch the source code of a single movie page.
#
# curr_movie_link: the link handed to readLines() as its connection.
# Returns a character vector with one element per line that was read; the
# strings are declared as UTF-8.
h_get_movie_source_code <- function(curr_movie_link) {
  readLines(curr_movie_link, encoding = "UTF-8")
}
# Extract the poster block from the source code of a single movie page.
#
# movie_source_code: character vector, one element per line of the page.
# Returns the lines from the first line matching <div class="poster"> through
# the first line at or after it matching "</a> </div>". Returns character(0)
# when either marker cannot be found.
#
# NOTE(review): the previous body stopped after locating the start line and
# then referenced an undefined variable, so every call failed. The end-line
# search below follows the approach of the box-office extractor in this file;
# confirm that "at or after the start line" is the intended rule for posters.
get_poster_from_movie_source_code <- function(movie_source_code) {
  poster_start_pattern <- "<div class=\"poster\">"
  poster_end_pattern <- "</a> </div>"

  start_hits <- grep(poster_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  poster_start_line <- start_hits[1L]

  # Only closing markers that do not precede the poster <div> can end it.
  end_hits <- grep(poster_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits >= poster_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[poster_start_line:end_hits[1L]]
}
# Get the basic-info JSON (the application/ld+json script) from the source
# code of a single movie page.
#
# movie_source_code: character vector, one element per line of the page.
# Returns the lines from the first line that opens the ld+json script through
# the first line at or after it that contains "}</script>". Returns
# character(0) when the opening or the closing marker cannot be found.
h_get_basics_from_movie_source_code <- function(movie_source_code) {
  json_start_pattern <- "<script type=\"application/ld\\+json\">\\{"
  json_end_pattern <- "\\}</script>"

  start_hits <- grep(json_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  json_start_line <- start_hits[1L]

  # Any script on the page can end with "}</script>", so closers located
  # before the JSON start must be ignored; otherwise start:end runs backwards.
  end_hits <- grep(json_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits >= json_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[json_start_line:end_hits[1L]]
}
# Get the Box Office section from the source code of a single movie page.
#
# movie_source_code: character vector, one element per line of the page.
# Returns the lines from the first "Box Office" subheading through the first
# "<hr />" line strictly after it. Returns character(0) when the page has no
# such subheading or no "<hr />" follows it, instead of failing on an empty
# or NA range.
h_get_box_office_from_movie_source_code <- function(movie_source_code) {
  box_office_start_pattern <- "<h3 class=\"subheading\">Box Office</h3>"
  box_office_end_pattern <- "<hr />"

  start_hits <- grep(box_office_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  box_office_start_line <- start_hits[1L]

  # "<hr />" also appears elsewhere on the page; keep only the rules that come
  # after the subheading and stop at the nearest one.
  end_hits <- grep(box_office_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits > box_office_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[box_office_start_line:end_hits[1L]]
}
# Stub: accepts `basics` but has no implementation yet, so it always returns
# NULL. Presumably meant to pull fields out of the basics JSON - TODO confirm.
h_get_basics_info <- function(basics) {
  NULL
}
# Try the helpers on the first movie link: download the page, cut out the
# basics JSON and the Box Office section, then print the JSON lines.
# NOTE(review): `m_link` is not defined in the visible part of this file (the
# collection loop further down reads `movie_link`) - confirm it exists.
curr_source_code <- h_get_movie_source_code(m_link[1])
curr_basics <- h_get_basics_from_movie_source_code(curr_source_code)
curr_box_office <- h_get_box_office_from_movie_source_code(curr_source_code)
cat(curr_basics)
<script type="application/ld+json">{ "@context": "http://schema.org", "@type": "Movie", "url": "/title/tt0111161/", "name": "The Shawshank Redemption", "image": "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg", "genre": "Drama", "contentRating": "R", "actor": [ { "@type": "Person", "url": "/name/nm0000209/", "name": "Tim Robbins" }, { "@type": "Person", "url": "/name/nm0000151/", "name": "Morgan Freeman" }, { "@type": "Person", "url": "/name/nm0348409/", "name": "Bob Gunton" }, { "@type": "Person", "url": "/name/nm0006669/", "name": "William Sadler" } ], "director": { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, "creator": [ { "@type": "Person", "url": "/name/nm0000175/", "name": "Stephen King" }, { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, { "@type": "Organization", "url": "/company/co0040620/" } ], "description": "The Shawshank Redemption is a movie starring Tim Robbins, Morgan Freeman, and Bob Gunton. Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "datePublished": "1994-09-23", "keywords": "wrongful imprisonment,based on the works of stephen king,prison,escape from prison,voice over narration", "aggregateRating": { "@type": "AggregateRating", "ratingCount": 2299184, "bestRating": "10.0", "worstRating": "1.0", "ratingValue": "9.3" }, "review": { "@type": "Review", "itemReviewed": { "@type": "CreativeWork", "url": "/title/tt0111161/" }, "author": { "@type": "Person", "name": "speedreid" }, "dateCreated": "2001-02-08", "inLanguage": "English", "name": "Prepare to be moved", "reviewBody": "I have never seen such an amazing film since I saw The Shawshank Redemption. Shawshank encompasses friendships, hardships, hopes, and dreams. And what is so great about the movie is that it moves you, it gives you hope. 
Even though the circumstances between the characters and the viewers are quite different, you don\u0027t feel that far removed from what the characters are going through.\n\nIt is a simple film, yet it has an everlasting message. Frank Darabont didn\u0027t need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him. Why this movie didn\u0027t win all seven Oscars is beyond me, but don\u0027t let that sway you to not see this film, let its ranking on the IMDb\u0027s top 250 list sway you, let your friends recommendation about the movie sway you.\n\nSet aside a little over two hours tonight and rent this movie. You will finally understand what everyone is talking about and you will understand why this is my all time favorite movie." }, "duration": "PT2H22M", "trailer": { "@type": "VideoObject", "name": "Official Trailer", "embedUrl": "/video/imdb/vi3877612057", "thumbnail": { "@type": "ImageObject", "contentUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg" }, "thumbnailUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg", "description": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "uploadDate": "2014-03-05T14:13:19Z" } }</script>
# Print the extracted Box Office lines, separated by spaces.
cat(curr_box_office)
<h3 class="subheading">Box Office</h3> <div class="txt-block"> <h4 class="inline">Budget:</h4>$25,000,000 <span class="attribute">(estimated)</span> </div> <div class="txt-block"> <h4 class="inline">Opening Weekend USA:</h4> $727,327, <span class="attribute">25 September 1994</span> </div> <div class="txt-block"> <h4 class="inline">Gross USA:</h4> $28,699,976 </div> <div class="txt-block"> <h4 class="inline">Cumulative Worldwide Gross:</h4> $28,815,291 </div> <span class="see-more inline"> <a href="https://pro.imdb.com/title/tt0111161?rf=cons_tt_bo_tt&ref_=cons_tt_bo_tt" >See more on IMDbPro</a> » </span> <hr />
| Field | Source pattern |
| --- | --- |
| Title | `h1 itemprop="name"` |
| Year | Next line of Title |
| Content Rating | `meta itemprop="contentRating"` |
| User Rating | `span itemprop="ratingValue"` |
| Number of Rater | `itemprop="ratingCount"` |
| Genre | `span class="itemprop" itemprop="genre"` |
| Budget | `<h4 class="inline">Budget` |
| Opening Weekend USA ($) | `<h4 class="inline">Opening Weekend USA` |
| Gross USA ($) | `<h4 class="inline">Gross` |
| Cumulative Worldwide Gross ($) | `<h4 class="inline">Cumulative` |
# Helper: split every element of `text` on the literal `sep` and keep piece
# number `n`. An NA element, or one with fewer than `n` pieces, gives NA.
h_nth_piece <- function(text, sep, n) {
  vapply(
    strsplit(text, split = sep, fixed = TRUE),
    function(pieces) pieces[n],
    character(1)
  )
}

# Helper: first element of `lines` containing the literal text `marker`, or NA
# when no element does.
h_first_line_with <- function(lines, marker) {
  hits <- grep(marker, lines, fixed = TRUE)
  if (length(hits) == 0L) {
    return(NA_character_)
  }
  lines[hits[1L]]
}

# Helper: rewrite the raw budget text so that a recognised currency is shown
# as "<CODE> <amount>"; any other text (e.g. "$25,000,000") is returned as is.
h_normalise_budget <- function(budget) {
  if (is.na(budget)) {
    return(budget)
  }

  # Budgets already written with a three-letter code keep only the code and
  # the amount (the first two space-separated tokens).
  currency_codes <- c("FRF", "JPY", "INR", "DEM", "RUR", "TRL", "AUD", "KRW")
  if (substr(budget, 1L, 3L) %in% currency_codes) {
    tokens <- strsplit(budget, split = " ", fixed = TRUE)[[1]]
    return(paste(tokens[1], tokens[2], sep = " "))
  }

  # Euro and pound prefixes, as an HTML entity or as the literal symbol. The
  # entities are assembled from two parts so that a tool which decodes HTML
  # entities in this file cannot collapse them into one character again; that
  # is what had left the previous 6- and 7-character comparisons unmatchable.
  prefixes <- c(
    paste0("&", "euro;"), "\u20ac",
    paste0("&", "pound;"), "\u00a3"
  )
  codes <- c("EUR", "EUR", "GBP", "GBP")
  for (k in seq_along(prefixes)) {
    if (startsWith(budget, prefixes[k])) {
      amount <- substr(budget, nchar(prefixes[k]) + 1L, nchar(budget))
      return(paste(codes[k], amount))
    }
  }

  budget
}

# Helper: amount that follows a Box Office heading. Takes the text after the
# closing </h4>, keeps its second space-separated token and drops one trailing
# comma (the opening weekend amount is followed by ", <date>").
h_amount_after_heading <- function(lines, heading) {
  after_heading <- h_nth_piece(h_first_line_with(lines, heading), ">", 3L)
  sub(",$", "", h_nth_piece(after_heading, " ", 2L))
}

# Helper: pull the ten target fields out of the lines of one movie page.
# Returns an unnamed character vector of length 10 in this order: title, year,
# content rating, user rating, number of raters, genre, budget, opening
# weekend USA, gross USA, cumulative worldwide gross. A field that cannot be
# found or parsed is reported as "-". When several lines match a single-value
# field, the first one is used so the result always has ten elements.
h_parse_target_info <- function(page_lines) {
  # 1. title / 2. year ---- (the year sits on the line after the title line)
  title_at <- grep("h1 itemprop=\"name\"", page_lines, fixed = TRUE)[1L]
  title <- h_nth_piece(h_nth_piece(page_lines[title_at], ">", 2L), "&", 1L)
  year <- h_nth_piece(h_nth_piece(page_lines[title_at + 1L], ">", 2L), "<", 1L)

  # 3. content rating ----
  content_rating <- h_nth_piece(
    h_first_line_with(page_lines, "meta itemprop=\"contentRating\""), ">", 2L
  )

  # 4. user rating ----
  user_rating <- h_nth_piece(
    h_nth_piece(
      h_first_line_with(page_lines, "span itemprop=\"ratingValue\""), ">", 3L
    ),
    "<", 1L
  )

  # 5. number of raters ----
  num_rater <- h_nth_piece(
    h_nth_piece(
      h_first_line_with(page_lines, "itemprop=\"ratingCount\""), ">", 3L
    ),
    "<", 1L
  )

  # 6. genre ---- (every matching line contributes one genre)
  genre_lines <- grep(
    "span class=\"itemprop\" itemprop=\"genre\"", page_lines,
    fixed = TRUE, value = TRUE
  )
  genre <- NA_character_
  if (length(genre_lines) > 0L) {
    genre_names <- h_nth_piece(h_nth_piece(genre_lines, ">", 3L), "<", 1L)
    genre <- paste(genre_names, collapse = ", ")
  }

  # 7. budget ----
  budget <- h_normalise_budget(
    h_nth_piece(
      h_first_line_with(page_lines, "<h4 class=\"inline\">Budget"), ">", 3L
    )
  )

  # 8. opening / 9. gross / 10. worldwide gross ----
  opening <- h_amount_after_heading(
    page_lines, "<h4 class=\"inline\">Opening Weekend USA"
  )
  gross <- h_amount_after_heading(page_lines, "<h4 class=\"inline\">Gross")
  worldwide_gross <- h_amount_after_heading(
    page_lines, "<h4 class=\"inline\">Cumulative"
  )

  # 11. result ----
  fields <- c(
    title, year, content_rating, user_rating, num_rater,
    genre, budget, opening, gross, worldwide_gross
  )
  fields[is.na(fields)] <- "-"
  fields
}

# Get the target information from a single movie page.
#
# input: a website link from `movie_link` (anything readLines() can open).
# Returns an unnamed character vector of length 10: title, year, content
# rating, user rating, number of raters, genre, budget, opening weekend USA,
# gross USA and cumulative worldwide gross, with "-" for any missing field.
get.target.info <- function(input) {
  page_lines <- readLines(con = input, encoding = "UTF-8")
  h_parse_target_info(page_lines)
}
#Collecting data----
# Scrape the first 250 links in one pass. vapply() allocates the result once
# instead of growing ten vectors with c() on every iteration, and it stops
# with an error if a page does not yield exactly ten character fields.
# `movie_info` has one row per field (in the order get.target.info() returns
# them) and one column per movie.
movie_info <- vapply(
  movie_link[1:250],
  get.target.info,
  character(10),
  USE.NAMES = FALSE
)
movie_title <- movie_info[1, ]
movie_year <- movie_info[2, ]
movie_content_rating <- movie_info[3, ]
movie_user_rating <- movie_info[4, ]
movie_num_rater <- movie_info[5, ]
movie_genre <- movie_info[6, ]
movie_budget <- movie_info[7, ]
movie_opening <- movie_info[8, ]
movie_gross <- movie_info[9, ]
movie_worldwide_gross <- movie_info[10, ]
#Visualization----
library(knitr)
# Put the collected vectors side by side, one row per movie. The column names
# come from the variable names.
y <- data.frame(
  movie_rank,
  movie_title,
  movie_year,
  movie_content_rating,
  movie_user_rating,
  movie_num_rater,
  movie_genre,
  movie_budget,
  movie_opening,
  movie_gross,
  movie_worldwide_gross
)
# Make every column plain character; before R 4.0 data.frame() turns strings
# into factors by default.
y[] <- lapply(y, as.character)
# Render the table, centred, with readable column headers.
kable(
  y,
  align = "c",
  col.names = c(
    "Rank", "Title", "Year", "Content Rating", "User Rating",
    "Number of Rater", "Genre", "Budget", "Opening Weekend USA",
    "Gross USA", "Cumulative Worldwide Gross"
  )
)